# Shared helpers and settings — presumably defines `seed`, `n_cores`, and
# attaches tidymodels/tidyverse (`%>%`, vfold_cv, ggplot2) — TODO confirm
source("utils.R")
library(sf)
library(spatialsample)

# Loads `df` (expected to contain poc, lon, lat and the environmental
# covariates selected below — verify against the 04 preparation script)
load("data/04.all_data.Rdata")

# Model the response on the log scale
df <- df %>% mutate(poc_log = log(poc))

# CV set-up
n_folds <- 10
n_reps <- 1
# Explanatory variables
# Candidate environmental predictors: keep only their column names
exp_vars <- df %>%
  select(
    temperature,
    phosphate,
    silicate,
    alkalinity,
    dic,
    npp,
    oxygen
  ) %>%
  colnames()

# Spatial cross validation ----
# Explore how spatial cross-validation can be used to predict POC from
# environmental variables.

## Stratified CV ----
# Cross-validation stratified on quintiles (default) of the response variable.

### Prepare data ----
# Cross-validation, 10 folds, stratified
set.seed(seed)
df_folds <- vfold_cv(df, v = n_folds, strata = poc_log, repeats = n_reps)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every split, labelled with their fold (and repeat, if any)
dist_folds <- lapply(seq_len(nrow(df_folds)), function(i) {
  x <- df_folds$splits[[i]]
  if (ncol(df_folds) == 2) {
    # No repeats: `id` is the fold label
    fold <- df_folds$id[[i]]
  } else {
    # With repeats: `id` is the repeat, `id2` the fold
    rep_id <- df_folds$id[[i]]
    fold <- df_folds$id2[[i]]
    fold <- paste(rep_id, fold)
  }
  bind_rows(
    analysis(x) %>% mutate(split = "analysis"),
    assessment(x) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold)
})
dist_folds <- bind_rows(dist_folds)

# POC distribution per fold
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split),
               show.legend = FALSE) +
  facet_wrap(~fold)

# POC distribution per split
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

# Analysis (i.e. train) and assessment (i.e. test) distributions are very
# similar or differ depending on folds. We can expect some dispersion of R².
### Train model ----
# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` (the ranger
  # spelling) is silently ignored by xgb.train
  set_engine("xgboost", nthread = n_cores)

## Recipe & Workflow
# Generate formula from list of explanatory variables; raw `poc` is kept in
# the data but demoted below so it is never used as a predictor
xgb_form <- as.formula(paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + ")))
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")
xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)

## Gridsearch
# NOTE: statement and argument order kept as-is — the latin hypercube draw
# depends on the RNG state set just above
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(),
  learn_rate(),
  tree_depth(),
  min_n(),
  size = 30
)

doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
#autoplot(xgb_res)
best_xgb <- select_best(xgb_res, metric = "rmse")

### Evaluate model ----
## Finalize
final_xgb <- finalize_workflow(
  xgb_wflow,
  best_xgb
)

## Evaluate
final_res_cv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_cv)
# # A tibble: 2 × 6
#   .metric .estimator  mean     n std_err .config
#   <chr>   <chr>      <dbl> <int>   <dbl> <chr>
# 1 rmse    standard   0.330    10  0.0211 Preprocessor1_Model1
# 2 rsq     standard   0.886    10  0.0139 Preprocessor1_Model1

# Per-fold R², kept for the final comparison across CV strategies
cv_metrics <- collect_metrics(final_res_cv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Strat")

ggplot(cv_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

# Indeed, some dispersion of R².
## Stratified CV, more constrained ----
# Cross-validation stratified on deciles of the response variable.

### Prepare data ----
# Cross-validation, 10 folds, stratified on deciles (breaks = 9)
set.seed(seed)
df_folds <- vfold_cv(df, v = n_folds, strata = poc_log, breaks = 9, repeats = n_reps)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every split, labelled with their fold (and repeat, if any)
dist_folds <- lapply(seq_len(nrow(df_folds)), function(i) {
  x <- df_folds$splits[[i]]
  if (ncol(df_folds) == 2) {
    # No repeats: `id` is the fold label
    fold <- df_folds$id[[i]]
  } else {
    # With repeats: `id` is the repeat, `id2` the fold
    rep_id <- df_folds$id[[i]]
    fold <- df_folds$id2[[i]]
    fold <- paste(rep_id, fold)
  }
  bind_rows(
    analysis(x) %>% mutate(split = "analysis"),
    assessment(x) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold)
})
dist_folds <- bind_rows(dist_folds)

# POC distribution per fold
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split),
               show.legend = FALSE) +
  facet_wrap(~fold)

# POC distribution per split
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

# Analysis (i.e. train) and assessment (i.e. test) distributions are very
# similar and coherent across folds. We can expect low dispersion of R².
### Train model ----
# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` (the ranger
  # spelling) is silently ignored by xgb.train
  set_engine("xgboost", nthread = n_cores)

## Recipe & Workflow
# Generate formula from list of explanatory variables; raw `poc` is kept in
# the data but demoted below so it is never used as a predictor
xgb_form <- as.formula(paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + ")))
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")
xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)

## Gridsearch
# NOTE: statement and argument order kept as-is — the latin hypercube draw
# depends on the RNG state set just above
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(),
  learn_rate(),
  tree_depth(),
  min_n(),
  size = 30
)

doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
#autoplot(xgb_res)
best_xgb <- select_best(xgb_res, metric = "rmse")

### Evaluate model ----
## Finalize
final_xgb <- finalize_workflow(
  xgb_wflow,
  best_xgb
)

## Evaluate
final_res_cv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_cv)
# # A tibble: 2 × 6
#   .metric .estimator  mean     n std_err .config
#   <chr>   <chr>      <dbl> <int>   <dbl> <chr>
# 1 rmse    standard   0.335    10  0.0169 Preprocessor1_Model1
# 2 rsq     standard   0.884    10  0.0119 Preprocessor1_Model1

# Per-fold R², kept for the final comparison across CV strategies
cvmc_metrics <- collect_metrics(final_res_cv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Strat dec")

ggplot(cvmc_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

## Spatial block CV ----
# Spatial cross-validation based on spatial blocks.

### Prepare data ----
# Spatial cross-validation, 10 folds
df_sf <- st_as_sf(df, coords = c("lon", "lat"), crs = 4326)
set.seed(seed)
df_folds <- spatial_block_cv(df_sf, v = n_folds, repeats = n_reps)
autoplot(df_folds)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every split, labelled with their fold (and repeat, if any)
dist_folds <- lapply(seq_len(nrow(df_folds)), function(i) {
  x <- df_folds$splits[[i]]
  if (ncol(df_folds) == 2) {
    # No repeats: `id` is the fold label
    fold <- df_folds$id[[i]]
  } else {
    # With repeats: `id` is the repeat, `id2` the fold
    rep_id <- df_folds$id[[i]]
    fold <- df_folds$id2[[i]]
    fold <- paste(rep_id, fold)
  }
  bind_rows(
    analysis(x) %>% mutate(split = "analysis"),
    assessment(x) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold)
})
dist_folds <- bind_rows(dist_folds)

# POC distribution per fold
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split),
               show.legend = FALSE) +
  facet_wrap(~fold)

# POC distribution per split
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

# Analysis distributions are very coherent but assessment distributions vary
# a lot. We can expect higher dispersion of R².
### Train model ----
# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` (the ranger
  # spelling) is silently ignored by xgb.train
  set_engine("xgboost", nthread = n_cores)

## Recipe & Workflow
# Generate formula from list of explanatory variables; raw `poc` is kept in
# the data but demoted below so it is never used as a predictor
xgb_form <- as.formula(paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + ")))
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")
xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)

## Gridsearch
# NOTE: statement and argument order kept as-is — the latin hypercube draw
# depends on the RNG state set just above
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(),
  learn_rate(),
  tree_depth(),
  min_n(),
  size = 30
)

doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
best_xgb <- select_best(xgb_res, metric = "rmse")

### Evaluate model ----
## Finalize
final_xgb <- finalize_workflow(
  xgb_wflow,
  best_xgb
)

## Evaluate
final_res_scv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_scv)
# # A tibble: 2 × 6
#   .metric .estimator  mean     n std_err .config
#   <chr>   <chr>      <dbl> <int>   <dbl> <chr>
# 1 rmse    standard   0.470    10  0.0432 Preprocessor1_Model1
# 2 rsq     standard   0.724    10  0.0637 Preprocessor1_Model1

# Per-fold R², kept for the final comparison across CV strategies
sbcv_metrics <- collect_metrics(final_res_scv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Block")

ggplot(sbcv_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

## Spatial cluster CV ----
# Spatial cross-validation based on spatial clusters.

### Prepare data ----
# Spatial cross-validation, 10 folds
df_sf <- st_as_sf(df, coords = c("lon", "lat"), crs = 4326)
set.seed(seed)
df_folds <- spatial_clustering_cv(df_sf, v = n_folds, repeats = n_reps)
autoplot(df_folds)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every split, labelled with their fold (and repeat, if any)
dist_folds <- lapply(seq_len(nrow(df_folds)), function(i) {
  x <- df_folds$splits[[i]]
  if (ncol(df_folds) == 2) {
    # No repeats: `id` is the fold label
    fold <- df_folds$id[[i]]
  } else {
    # With repeats: `id` is the repeat, `id2` the fold
    rep_id <- df_folds$id[[i]]
    fold <- df_folds$id2[[i]]
    fold <- paste(rep_id, fold)
  }
  bind_rows(
    analysis(x) %>% mutate(split = "analysis"),
    assessment(x) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold)
})
dist_folds <- bind_rows(dist_folds)

# POC distribution per fold
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split),
               show.legend = FALSE) +
  facet_wrap(~fold)

# POC distribution per split
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

# Once again, distribution varies a lot across assessment splits. Here the
# prediction task should be harder because each assessment fold is not
# represented at all in its corresponding analysis fold.
### Train model ----
# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` (the ranger
  # spelling) is silently ignored by xgb.train
  set_engine("xgboost", nthread = n_cores)

## Recipe & Workflow
# Generate formula from list of explanatory variables; raw `poc` is kept in
# the data but demoted below so it is never used as a predictor
xgb_form <- as.formula(paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + ")))
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")
xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)

## Gridsearch
# NOTE: statement and argument order kept as-is — the latin hypercube draw
# depends on the RNG state set just above
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(),
  learn_rate(),
  tree_depth(),
  min_n(),
  size = 30
)

doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
best_xgb <- select_best(xgb_res, metric = "rmse")

### Evaluate model ----
## Finalize
final_xgb <- finalize_workflow(
  xgb_wflow,
  best_xgb
)

## Evaluate
final_res_scv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_scv)
# # A tibble: 2 × 6
#   .metric .estimator  mean     n std_err .config
#   <chr>   <chr>      <dbl> <int>   <dbl> <chr>
# 1 rmse    standard   0.626    10  0.0963 Preprocessor1_Model1
# 2 rsq    standard   0.554    10  0.102  Preprocessor1_Model1

# Per-fold R², kept for the final comparison across CV strategies
sccv_metrics <- collect_metrics(final_res_scv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Clust")

ggplot(sccv_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

## Overall comparison of R² ----
# Compare per-fold R² across the four CV strategies, keeping the order in
# which they were computed on the x axis
bind_rows(cv_metrics, cvmc_metrics, sbcv_metrics, sccv_metrics) %>%
  mutate(cv_method = fct_inorder(cv_method)) %>%
  ggplot() +
  geom_boxplot(aes(x = cv_method, y = .estimate, group = cv_method))